# Анализ данных по COVID-19

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the observations from the CSV file (path is relative to the working directory)
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='covid_19_data.csv')
data.head(n=5)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(156292, 8)

In [5]:
# Print a per-column summary: dtype, non-null count, and overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156292 entries, 0 to 156291
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   SNo              156292 non-null  int64  
 1   ObservationDate  156292 non-null  object 
 2   Province/State   111979 non-null  object 
 3   Country/Region   156292 non-null  object 
 4   Last Update      156292 non-null  object 
 5   Confirmed        156292 non-null  float64
 6   Deaths           156292 non-null  float64
 7   Recovered        156292 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 9.5+ MB


In [6]:
# Inspect the column labels (keys) available in the frame.
data.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')

In [8]:
# Drop the columns that are not needed for the analysis.
# `errors='ignore'` keeps this cell re-runnable: once 'SNo' and 'Last Update'
# have been removed, running the cell again is a no-op instead of a KeyError.
# Reassignment is used rather than `inplace=True` so the result stays explicit.
data = data.drop(columns=['SNo', 'Last Update'], errors='ignore')

data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0


In [10]:
# Duplicate check: flag every row that repeats an earlier
# (date, country/region, province/state) combination.
data.duplicated(
    subset=['ObservationDate', 'Country/Region', 'Province/State'],
    keep='first',
)

0         False
1         False
2         False
3         False
4         False
          ...  
156287    False
156288    False
156289    False
156290    False
156291    False
Length: 156292, dtype: bool

In [11]:
# Duplicate check: count the rows that repeat an earlier
# (date, country/region, province/state) combination.
data.duplicated(
    subset=['ObservationDate', 'Country/Region', 'Province/State'],
    keep='first',
).sum()

4

In [12]:
# Remove duplicates: keep only the first row for each
# (date, country/region, province/state) combination.
data = data.drop_duplicates(
    subset=['ObservationDate', 'Country/Region', 'Province/State'],
    keep='first',
)

In [13]:
# Re-count duplicates on the same key columns after the removal step.
data.duplicated(
    subset=['ObservationDate', 'Country/Region', 'Province/State'],
    keep='first',
).sum()

0

In [14]:
# Explore the countries: list the distinct values of the 'Country/Region' column.
data['Country/Region'].unique()

array(['Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'US', 'Japan',
       'Thailand', 'South Korea', 'Singapore', 'Philippines', 'Malaysia',
       'Vietnam', 'Australia', 'Mexico', 'Brazil', 'Colombia', 'France',
       'Nepal', 'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast',
       'Germany', 'Finland', 'United Arab Emirates', 'India', 'Italy',
       'UK', 'Russia', 'Sweden', 'Spain', 'Belgium', 'Others', 'Egypt',
       'Iran', 'Israel', 'Lebanon', 'Iraq', 'Oman', 'Afghanistan',
       'Bahrain', 'Kuwait', 'Austria', 'Algeria', 'Croatia',
       'Switzerland', 'Pakistan', 'Georgia', 'Greece', 'North Macedonia',
       'Norway', 'Romania', 'Denmark', 'Estonia', 'Netherlands',
       'San Marino', ' Azerbaijan', 'Belarus', 'Iceland', 'Lithuania',
       'New Zealand', 'Nigeria', 'North Ireland', 'Ireland', 'Luxembourg',
       'Monaco', 'Qatar', 'Ecuador', 'Azerbaijan', 'Czech Republic',
       'Armenia', 'Dominican Republic', 'Indonesia', 'Portugal',
       'Andorra', 'Latvia

In [15]:
# Print every distinct country/region label on its own line,
# in the order the labels first appear in the data.
country_array = pd.unique(data['Country/Region'])
for country in country_array:
    print(country)

Mainland China
Hong Kong
Macau
Taiwan
US
Japan
Thailand
South Korea
Singapore
Philippines
Malaysia
Vietnam
Australia
Mexico
Brazil
Colombia
France
Nepal
Canada
Cambodia
Sri Lanka
Ivory Coast
Germany
Finland
United Arab Emirates
India
Italy
UK
Russia
Sweden
Spain
Belgium
Others
Egypt
Iran
Israel
Lebanon
Iraq
Oman
Afghanistan
Bahrain
Kuwait
Austria
Algeria
Croatia
Switzerland
Pakistan
Georgia
Greece
North Macedonia
Norway
Romania
Denmark
Estonia
Netherlands
San Marino
 Azerbaijan
Belarus
Iceland
Lithuania
New Zealand
Nigeria
North Ireland
Ireland
Luxembourg
Monaco
Qatar
Ecuador
Azerbaijan
Czech Republic
Armenia
Dominican Republic
Indonesia
Portugal
Andorra
Latvia
Morocco
Saudi Arabia
Senegal
Argentina
Chile
Jordan
Ukraine
Saint Barthelemy
Hungary
Faroe Islands
Gibraltar
Liechtenstein
Poland
Tunisia
Palestine
Bosnia and Herzegovina
Slovenia
South Africa
Bhutan
Cameroon
Costa Rica
Peru
Serbia
Slovakia
Togo
Vatican City
French Guiana
Malta
Martinique
Republic of Ireland
Bulgaria
Maldives
Ba

In [16]:
# Print the distinct country/region labels in ascending string order, one per line.
# Sorting makes near-duplicate spellings easier to spot.
country_array = data['Country/Region'].unique()
for country in sorted(country_array):
    print(country)

 Azerbaijan
('St. Martin',)
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahamas, The
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burma
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Cape Verde
Cayman Islands
Central African Republic
Chad
Channel Islands
Chile
Colombia
Comoros
Congo (Brazzaville)
Congo (Kinshasa)
Costa Rica
Croatia
Cuba
Curacao
Cyprus
Czech Republic
Denmark
Diamond Princess
Djibouti
Dominica
Dominican Republic
East Timor
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Faroe Islands
Fiji
Finland
France
French Guiana
Gabon
Gambia
Gambia, The
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guadeloupe
Guam
Guatemala
Guernsey
Guinea
Guinea-Bissau
Guyana
Haiti
Holy See
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Ivory Coast
Jamaic

In [17]:
# Examine the 'Others' entries: build a boolean mask on the
# country/region column and use it to select the matching rows.
data.loc[data['Country/Region'].eq('Others')]

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
933,02/07/2020,Cruise Ship,Others,61.0,0.0,0.0
1005,02/08/2020,Cruise Ship,Others,61.0,0.0,0.0
1077,02/09/2020,Diamond Princess cruise ship,Others,64.0,0.0,0.0
1143,02/10/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1215,02/11/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1286,02/12/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1359,02/13/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1433,02/14/2020,Diamond Princess cruise ship,Others,218.0,0.0,0.0
1506,02/15/2020,Diamond Princess cruise ship,Others,285.0,0.0,0.0
1578,02/16/2020,Diamond Princess cruise ship,Others,355.0,0.0,0.0


In [18]:
# Tidying up the dates: start by looking at the raw 'ObservationDate' column.
data['ObservationDate']

0         01/22/2020
1         01/22/2020
2         01/22/2020
3         01/22/2020
4         01/22/2020
             ...    
156287    11/15/2020
156288    11/15/2020
156289    11/15/2020
156290    11/15/2020
156291    11/15/2020
Name: ObservationDate, Length: 156288, dtype: object

In [19]:
# Look at the first stored date value.
# Positional access (.iloc) is used instead of the label lookup `[0]`: rows were
# removed during de-duplication, so the index labels are no longer contiguous
# and a row labelled 0 is not guaranteed to exist.
data['ObservationDate'].iloc[0]

'01/22/2020'

In [21]:
# Example of converting a date string: parse it with an explicit
# month/day/year format, then keep only the calendar date.
pd.to_datetime('01/22/2020', format='%m/%d/%Y').date()

datetime.date(2020, 1, 22)

In [22]:
# Same conversion without .date(): the result is a full pandas Timestamp.
pd.to_datetime('01/22/2020', format='%m/%d/%Y')

Timestamp('2020-01-22 00:00:00')