## Data cleaning

In [1]:
## Required packages loaded
import pandas as pd
import numpy as np
import re
import difflib
import seaborn as sns #for plotting graphs
import matplotlib.pyplot as plt #for plotting graphs

### Covid_19 government measures dataset

In [2]:
# Load the data from excel file into a data frame
xlsx = pd.ExcelFile('../datasets/covid19_government_measures_dataset.xlsx')
gov_measures = pd.read_excel(xlsx, 'Dataset')

In [3]:
# Check the dataframe
gov_measures.head()

Unnamed: 0,ID,ISO,COUNTRY,REGION,ADMIN_LEVEL_NAME,PCODE,LOG_TYPE,CATEGORY,MEASURE,TARGETED_POP_GROUP,COMMENTS,NON_COMPLIANCE,DATE_IMPLEMENTED,SOURCE,SOURCE_TYPE,LINK,ENTRY_DATE,Alternative source
0,4245,AFG,Afghanistan,Asia,,,Introduction / extension of measures,Public health measures,Awareness campaigns,,MoPH begins announcements on their facebook to...,,2020-01-24,Afghanistan MoPH,Government,https://www.facebook.com/af.moph/posts/1005130...,2020-04-07,
1,4246,AFG,Afghanistan,Asia,,,Introduction / extension of measures,Public health measures,Health screenings in airports and border cross...,checked,Health teams at airports will check passengers...,,2020-01-26,Afghanistan MoPH,Government,https://www.facebook.com/af.moph/posts/1006458...,2020-04-07,
2,4247,AFG,Afghanistan,Asia,,,Introduction / extension of measures,Movement restrictions,International flights suspension,checked,Flights to China are suspended.,,2020-01-27,Afghanistan MoPH,Government,https://www.facebook.com/af.moph/photos/a.4625...,2020-04-07,
3,4248,AFG,Afghanistan,Asia,,,Introduction / extension of measures,Public health measures,Health screenings in airports and border cross...,,Health screenings of all passengers at airports.,,2020-01-27,Afghanistan MoPH,Government,https://www.facebook.com/af.moph/photos/a.4625...,2020-04-07,
4,23,AFG,Afghanistan,Asia,,,Introduction / extension of measures,Movement restrictions,Border checks,checked,All China and Iran nationals,,2020-02-01,US Embassy,Government,https://af.usembassy.gov/covid-19-information/,2020-03-14,


In [4]:
# Check the row and column count of the dataframe
gov_measures.shape

(23923, 18)

In [5]:
# Check the info of the data structure
gov_measures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23923 entries, 0 to 23922
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  23923 non-null  int64         
 1   ISO                 23923 non-null  object        
 2   COUNTRY             23923 non-null  object        
 3   REGION              23923 non-null  object        
 4   ADMIN_LEVEL_NAME    3682 non-null   object        
 5   PCODE               0 non-null      float64       
 6   LOG_TYPE            23923 non-null  object        
 7   CATEGORY            23923 non-null  object        
 8   MEASURE             23923 non-null  object        
 9   TARGETED_POP_GROUP  7556 non-null   object        
 10  COMMENTS            23799 non-null  object        
 11  NON_COMPLIANCE      22764 non-null  object        
 12  DATE_IMPLEMENTED    23630 non-null  datetime64[ns]
 13  SOURCE              23900 non-null  object    

In [6]:
# Check the null values in all columns
gov_measures.isnull().sum()

ID                        0
ISO                       0
COUNTRY                   0
REGION                    0
ADMIN_LEVEL_NAME      20241
PCODE                 23923
LOG_TYPE                  0
CATEGORY                  0
MEASURE                   0
TARGETED_POP_GROUP    16367
COMMENTS                124
NON_COMPLIANCE         1159
DATE_IMPLEMENTED        293
SOURCE                   23
SOURCE_TYPE              11
LINK                     33
ENTRY_DATE                0
Alternative source    22144
dtype: int64

In [7]:
# Check the ID column
gov_measures[['ID','COUNTRY']]

Unnamed: 0,ID,COUNTRY
0,4245,Afghanistan
1,4246,Afghanistan
2,4247,Afghanistan
3,4248,Afghanistan
4,23,Afghanistan
...,...,...
23918,13724,Zimbabwe
23919,14735,Zimbabwe
23920,14734,Zimbabwe
23921,13723,Zimbabwe


In [8]:
# Since we are going to deal with country wise data, the ID column contains multiple Id for one country.
# Hence we are dropping this column from the dataframe
gov_measures.drop('ID', inplace=True, axis=1)

In [9]:
# Check the ADMIN_LEVEL_NAME column
gov_measures.ADMIN_LEVEL_NAME.unique()

array([nan, 'Kabul', 'Herat Province', ..., 'Houthi rebel group',
       'Houthi regions', 'Nakonde'], dtype=object)

In [10]:
# Since the ADMIN_LEVEL_NAME column scarsely populated city names we are dropping this column from the dataframe
gov_measures.drop('ADMIN_LEVEL_NAME', inplace=True, axis=1)

In [11]:
# Check the PCODE column
gov_measures.PCODE.unique()

array([nan])

In [12]:
# Since this field does not contain any value in it we are dropping this column from the dataframe
gov_measures.drop('PCODE', inplace=True, axis=1)

In [13]:
# Check the TARGETED_POP_GROUP column
gov_measures.TARGETED_POP_GROUP.unique()

array([nan, 'checked'], dtype=object)

In [14]:
# Since the TARGETED_POP_GROUP column scarsely populated checked/blank values we are dropping this column from 
# the dataframe.It is evident that this column will not be helpful in the country wise analysis
gov_measures.drop('TARGETED_POP_GROUP', inplace=True, axis=1)

In [15]:
# Check the COMMENTS column
gov_measures.COMMENTS.unique()

array(['MoPH begins announcements on their facebook to make public aware of coronavirus. ',
       'Health teams at airports will check passengers coming from China. ',
       'Flights to China are suspended. ', ...,
       "Restaurants now allowed to serve sit-in meals at 50 percent of the restaurant's sitting capacity.",
       'the re-opening of schools be moved from the proposed 29 June 2020 to 28 July 2020',
       'Passengers will undergo temperature checks at the airport.'],
      dtype=object)

In [16]:
# Since we are going to deal with country wise statistical data, 
# the COMMENTS column will not be helpful for the analysis as it contains only string comments
# Hence we are dropping this column from the dataframe
gov_measures.drop('COMMENTS', inplace=True, axis=1)

In [17]:
# Since we are going to deal with country wise statistical data, 
# the SOURCE column contains the details of the information actual source which will not be helpful for the analysis
# Hence we are dropping this column from the dataframe
gov_measures.drop('SOURCE', inplace=True, axis=1)

In [18]:
# Check the LINK column
gov_measures.LINK.unique()

array(['https://www.facebook.com/af.moph/posts/1005130256522820?__tn__=-R',
       'https://www.facebook.com/af.moph/posts/1006458563056656?__tn__=-R ',
       'https://www.facebook.com/af.moph/photos/a.462598094109375/1007117789657400/?type=3&theater',
       ...,
       'https://twitter.com/MinOfInfoZW/status/1270378086475927555?s=20',
       'https://twitter.com/MinOfInfoZW/status/1278043785079988229?s=20',
       'https://www.emirates.com/za/english/help/faqs/travel-advisory-zimbabwe/'],
      dtype=object)

In [19]:
# Since we are going to deal with country wise statistical data, 
# the LINK column contains the website links of the avtual source which will not be helpful for the analysis
# Hence we are dropping this column from the dataframe
gov_measures.drop('LINK', inplace=True, axis=1)

In [20]:
# Check the ENTRY_DATE column
gov_measures.ENTRY_DATE.unique()

array(['2020-04-07T00:00:00.000000000', '2020-03-14T00:00:00.000000000',
       '2020-05-11T00:00:00.000000000', '2020-05-18T00:00:00.000000000',
       '2020-07-07T00:00:00.000000000', '2020-08-21T00:00:00.000000000',
       '2020-09-25T00:00:00.000000000', '2020-04-15T00:00:00.000000000',
       '2020-04-24T00:00:00.000000000', '2020-03-20T00:00:00.000000000',
       '2020-03-24T00:00:00.000000000', '2020-04-03T00:00:00.000000000',
       '2020-05-04T00:00:00.000000000', '2020-05-22T00:00:00.000000000',
       '2020-05-29T00:00:00.000000000', '2020-06-04T00:00:00.000000000',
       '2020-06-18T00:00:00.000000000', '2020-07-28T00:00:00.000000000',
       '2020-08-28T00:00:00.000000000', '2020-09-26T00:00:00.000000000',
       '2020-10-27T00:00:00.000000000', '2020-11-20T00:00:00.000000000',
       '2020-12-06T00:00:00.000000000', '2020-03-26T00:00:00.000000000',
       '2020-04-28T00:00:00.000000000', '2020-05-05T00:00:00.000000000',
       '2020-05-13T00:00:00.000000000', '2020-06-11

In [21]:
# Since we are going to deal with country wise statistical data, 
# the ENTRY_DATE column contains the entry date of the information on the source website 
# which will not be helpful for the analysis
# Hence we are dropping this column from the dataframe
gov_measures.drop('ENTRY_DATE', inplace=True, axis=1)

In [22]:
# Check the Alternative source column
gov_measures['Alternative source'].unique()

array([nan, 'http://tinyurl.com/twpqsek ',
       'https://www.facebook.com/af.moph/posts/1035892706779908?__tn__=-R',
       ..., 'https://www.bbc.com/news/topics/crr7mlg0rpvt/zimbabwe',
       'https://www.garda.com/crisis24/news-alerts/339441/zimbabwe-face-masks-mandatory-in-public-spaces-may-4-update-7',
       'https://zw.usembassy.gov/covid-19-information-2/'], dtype=object)

In [23]:
# Since we are going to deal with country wise statistical data, 
# the Alternative source column contains secondary source information which will not be helpful for the analysis
# Hence we are dropping this column from the dataframe
gov_measures.drop('Alternative source', inplace=True, axis=1)

In [24]:
# Final dataset with actual columns to analyse
gov_measures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23923 entries, 0 to 23922
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ISO               23923 non-null  object        
 1   COUNTRY           23923 non-null  object        
 2   REGION            23923 non-null  object        
 3   LOG_TYPE          23923 non-null  object        
 4   CATEGORY          23923 non-null  object        
 5   MEASURE           23923 non-null  object        
 6   NON_COMPLIANCE    22764 non-null  object        
 7   DATE_IMPLEMENTED  23630 non-null  datetime64[ns]
 8   SOURCE_TYPE       23912 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 1.6+ MB


In [25]:
# Final dataset all column null check
gov_measures.isnull().sum()

ISO                    0
COUNTRY                0
REGION                 0
LOG_TYPE               0
CATEGORY               0
MEASURE                0
NON_COMPLIANCE      1159
DATE_IMPLEMENTED     293
SOURCE_TYPE           11
dtype: int64

In [26]:
# Renaming of columns
gov_measures.rename(columns = {'SOURCE_TYPE':'SOURCE'}, inplace = True)
gov_measures.rename(columns = {'LOG_TYPE':'EXISTING_MEASURE_OR_NEW_MEASURE'}, inplace = True)

In [27]:
# Check the SOURCE column
gov_measures['SOURCE'].value_counts(dropna=False)

Government             15624
Media                   5681
Other organisations     1462
Social media             525
Other Organisations      424
UN                       107
Social Media              30
Other                     30
media                     23
NaN                       11
other                      6
Name: SOURCE, dtype: int64

In [28]:
# To correct the inconsistent values in source column converting the column values to lowercase
gov_measures['SOURCE'] = gov_measures['SOURCE'].str.lower()

In [29]:
# convert the column ISO to string datatype
gov_measures['ISO'] = gov_measures['ISO'].astype('string')
# convert the column COUNTRY to string datatype
gov_measures['COUNTRY'] = gov_measures['COUNTRY'].astype('string')
# convert the column REGION to string datatype
gov_measures['REGION'] = gov_measures['REGION'].astype('string')
# convert the column EXISTING_MEASURE_OR_NEW_MEASURE to string datatype
gov_measures['EXISTING_MEASURE_OR_NEW_MEASURE'] = gov_measures['EXISTING_MEASURE_OR_NEW_MEASURE'].astype('string')
# convert the column MEASURE to string datatype
gov_measures['CATEGORY'] = gov_measures['CATEGORY'].astype('string')
# convert the column MEASURE to string datatype
gov_measures['MEASURE'] = gov_measures['MEASURE'].astype('string')
# convert the column NON_COMPLIANCE to string datatype
gov_measures['NON_COMPLIANCE'] = gov_measures['NON_COMPLIANCE'].astype('string')
# convert the column SOURCE to string datatype
gov_measures['SOURCE'] = gov_measures['SOURCE'].astype('string')

In [30]:
# Final dataset all column null check
gov_measures.isnull().sum()

ISO                                   0
COUNTRY                               0
REGION                                0
EXISTING_MEASURE_OR_NEW_MEASURE       0
CATEGORY                              0
MEASURE                               0
NON_COMPLIANCE                     1159
DATE_IMPLEMENTED                    293
SOURCE                               11
dtype: int64

In [31]:
# Check for the unique values in NON_COMPLIANCE column
gov_measures.NON_COMPLIANCE.value_counts(dropna=False)

Not applicable                  13256
Not available                    3350
Not Available                    1779
Not Applicable                   1708
NaN                              1159
Fines                             748
Refusal to Enter the Country      432
Up to detention                   414
Other (add in comments)           343
Arrest/Detention                  235
Refusal to enter the country      222
fines                             114
Legal action                       77
not applicable                     24
Other                              20
Deportation                        10
Up to Detention                     9
Legal Action                        8
not available                       7
Not available                       6
Legal                               2
Name: NON_COMPLIANCE, dtype: Int64

In [32]:
# Correct the NaN values on the NON_COMPLIANCE column into 'non-specified'
gov_measures['NON_COMPLIANCE'] = np.where(gov_measures['NON_COMPLIANCE'].isnull(),'non-specified',gov_measures['NON_COMPLIANCE'])

In [33]:
# To address the inconsistent value issue. The values are converted to lower case
gov_measures['NON_COMPLIANCE'] = gov_measures['NON_COMPLIANCE'].str.lower()
# Check for the unique values in NON_COMPLIANCE column post value converted to lower case
gov_measures.NON_COMPLIANCE.unique()

array(['non-specified', 'not applicable', 'not available',
       'not available ', 'fines', 'up to detention',
       'other (add in comments)', 'refusal to enter the country',
       'arrest/detention', 'legal action', 'deportation', 'other',
       'legal'], dtype=object)

In [34]:
# finding the best match from the rest of all values with a specific threshold 0.85 and can be lowered if there is no match found
print(difflib.get_close_matches('legal action',set(gov_measures['NON_COMPLIANCE']).difference(['legal action']), n = 1,cutoff=0.55))
print(difflib.get_close_matches('not available',set(gov_measures['NON_COMPLIANCE']).difference(['not available']), n = 1,cutoff=0.65))
print(difflib.get_close_matches('other',set(gov_measures['NON_COMPLIANCE']).difference(['other']), n = 1,cutoff=0.25))

['legal']
['not available ']
['other (add in comments)']


In [35]:
# Few typos where discovered using the above code which are added to the below dictionary
replace_dict = {'legal':'legal action', 
                'other':'other (add in comments)',
                'not available ' : 'not available'}
# Replace the values given in the dictionary
gov_measures['NON_COMPLIANCE'].replace(replace_dict,inplace=True)

In [36]:
# Check for the unique values in NON_COMPLIANCE column
gov_measures.NON_COMPLIANCE.value_counts(dropna=False)

not applicable                  14988
not available                    5142
non-specified                    1159
fines                             862
refusal to enter the country      654
up to detention                   423
other (add in comments)           363
arrest/detention                  235
legal action                       87
deportation                        10
Name: NON_COMPLIANCE, dtype: int64

In [37]:
# Check for the unique values in CATEGORY column
gov_measures.CATEGORY.value_counts(dropna=False)

Public health measures                    7954
Social distancing                         5540
Movement restrictions                     5159
Governance and socio-economic measures    4364
Lockdown                                   877
Humanitarian exemption                      29
Name: CATEGORY, dtype: Int64

In [38]:
# Check for the unique values in MEASURE column
gov_measures.MEASURE.value_counts(dropna=False)

Economic measures                                               2980
Closure of businesses and public services                       2298
Limit public gatherings                                         2229
Strengthening the public health system                          1850
Isolation and quarantine policies                               1370
Domestic travel restrictions                                    1018
General recommendations                                          936
Border closure                                                   896
Other public health measures enforced                            894
Schools closure                                                  873
Curfews                                                          782
Awareness campaigns                                              780
International flights suspension                                 752
Emergency administrative structures activated or established     733
Requirement to wear protective gea

In [39]:
# Correct the 'non-specified' values on the NON_COMPLIANCE column into NaN
gov_measures['NON_COMPLIANCE'] = np.where(gov_measures['NON_COMPLIANCE']=='non-specified',None,gov_measures['NON_COMPLIANCE'])

In [40]:
# Checking the Distribution of values
#test = gov_measures.groupby(['CATEGORY','MEASURE','NON_COMPLIANCE']).size().reset_index().rename(columns={0:'count'})

In [41]:
# check of missing values in column NON_COMPLIANCE
gov_measures['NON_COMPLIANCE'].isnull().sum()

1159

In [42]:
# grouping based on CATEGORY, MEASURE and NON_COMPLIANCE
groups = gov_measures.groupby(['CATEGORY','MEASURE'])

In [43]:
# all the rows within a group is NaN
all_na = groups['NON_COMPLIANCE'].transform(lambda x: x.isna().all())

In [44]:
# fill global mode
gov_measures.loc[all_na, 'NON_COMPLIANCE'] = gov_measures['NON_COMPLIANCE'].mode()[0]

In [45]:
# fill with local mode
mode_by_group = groups['NON_COMPLIANCE'].transform(lambda x: x.mode()[0])
gov_measures['NON_COMPLIANCE'] = gov_measures['NON_COMPLIANCE'].fillna(mode_by_group)

In [46]:
# Check the null values in the df 
gov_measures.isnull().sum()

ISO                                  0
COUNTRY                              0
REGION                               0
EXISTING_MEASURE_OR_NEW_MEASURE      0
CATEGORY                             0
MEASURE                              0
NON_COMPLIANCE                       0
DATE_IMPLEMENTED                   293
SOURCE                              11
dtype: int64

In [47]:
# Check for missing values in column SOURCE
gov_measures['SOURCE'].value_counts(dropna=False)

government             15624
media                   5704
other organisations     1886
social media             555
un                       107
other                     36
NaN                       11
Name: SOURCE, dtype: Int64

In [48]:
# Drop the records having missing values
gov_measures = gov_measures[gov_measures['SOURCE'].notna()]

In [49]:
# Check for missing values again in column post the dropna function implementation
gov_measures['SOURCE'].value_counts(dropna=False)

government             15624
media                   5704
other organisations     1886
social media             555
un                       107
other                     36
Name: SOURCE, dtype: Int64

In [50]:
# Check for missing values in column DATE_IMPLEMENTED
gov_measures['DATE_IMPLEMENTED'].value_counts(dropna=False)

2020-03-16    342
2020-05-11    336
NaT           289
2020-03-20    282
2020-06-01    279
             ... 
2020-01-06      1
2020-12-16      1
2021-01-15      1
2021-01-04      1
2020-12-15      1
Name: DATE_IMPLEMENTED, Length: 353, dtype: int64

In [51]:
# Drop the records having missing values
gov_measures = gov_measures[gov_measures['DATE_IMPLEMENTED'].notna()]

In [52]:
# Check for missing values again in column post the dropna function implementation
gov_measures['DATE_IMPLEMENTED'].value_counts(dropna=False)

2020-03-16    342
2020-05-11    336
2020-03-20    282
2020-06-01    279
2020-05-18    270
             ... 
2020-01-06      1
2020-12-16      1
2021-01-15      1
2021-01-04      1
2020-12-15      1
Name: DATE_IMPLEMENTED, Length: 352, dtype: int64

In [53]:
#Converting the data field from format YYYY-MM-DD to YYYY-MM
gov_measures['DATE_IMPLEMENTED'] = gov_measures['DATE_IMPLEMENTED'].dt.to_period('M')

In [54]:
# Check for missing values again in column post the dropna function implementation
gov_measures['SOURCE'].value_counts(dropna=False)

government             15427
media                   5661
other organisations     1840
social media             554
un                       107
other                     34
Name: SOURCE, dtype: Int64

In [55]:
# Dropping the source column since it is not useful for our analysis
gov_measures.drop('SOURCE', inplace=True, axis=1)

In [56]:
# Dropping the 'EXISTING_MEASURE_OR_NEW_MEASURE' column since it is not useful for our analysis
gov_measures.drop('EXISTING_MEASURE_OR_NEW_MEASURE', inplace=True, axis=1)

In [57]:
# Dropping the 'NON_COMPLIANCE' column since we have location column
gov_measures.drop('NON_COMPLIANCE', inplace=True, axis=1)

#### Filter g20 countries

In [58]:
g20_countries = ['Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'Germany', 'France',
                 'India', 'Indonesia', 'Italy', 'Japan', 'Mexico', 'Russia', 'Saudi Arabia', 
                 'South Africa', 'South Korea', 'Turkey', 'United Kingdom' ,'United States']

In [59]:
gov_measures_subset = gov_measures[gov_measures['COUNTRY'].isin(g20_countries)]

In [60]:
gov_measures_subset['COUNTRY']=gov_measures_subset['COUNTRY'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_measures_subset['COUNTRY']=gov_measures_subset['COUNTRY'].str.strip()


In [61]:
# Check the null values in the df 
gov_measures_subset.isnull().sum()

ISO                 0
COUNTRY             0
REGION              0
CATEGORY            0
MEASURE             0
DATE_IMPLEMENTED    0
dtype: int64

In [62]:
# Final cleaned dataframe
gov_measures_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3946 entries, 660 to 23362
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype    
---  ------            --------------  -----    
 0   ISO               3946 non-null   string   
 1   COUNTRY           3946 non-null   string   
 2   REGION            3946 non-null   string   
 3   CATEGORY          3946 non-null   string   
 4   MEASURE           3946 non-null   string   
 5   DATE_IMPLEMENTED  3946 non-null   period[M]
dtypes: period[M](1), string(5)
memory usage: 215.8 KB


In [63]:
#Rename the CATEGORY column to MAIN_MEASURES gov_measures_subset
gov_measures_subset.rename(columns = {'CATEGORY':'MAIN_MEASURES'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_measures_subset.rename(columns = {'CATEGORY':'MAIN_MEASURES'}, inplace = True)


In [64]:
#Rename the MEASURES column to STEPS_TAKEN gov_measures_subset
gov_measures_subset.rename(columns = {'MEASURE':'STEPS_TAKEN'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_measures_subset.rename(columns = {'MEASURE':'STEPS_TAKEN'}, inplace = True)


In [65]:
#renaming the REGION to be uniform across all data sets
replace_dict = {'Americas':'South America', 'Middle east':'Asia', 'Pacific':'Oceania'}
gov_measures_subset['REGION'].replace(replace_dict,inplace=True)
gov_measures_subset['REGION'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_measures_subset['REGION'].replace(replace_dict,inplace=True)


Europe           1415
South America    1174
Asia              710
Oceania           554
Africa             93
Name: REGION, dtype: int64

In [66]:
gov_measures_subset.loc[gov_measures_subset['COUNTRY'] == 'Canada', "REGION"] = "North America"
gov_measures_subset.loc[gov_measures_subset['COUNTRY'] == 'United States', "REGION"] = "North America"
gov_measures_subset.loc[gov_measures_subset['COUNTRY'] == 'Mexico', "REGION"] = "North America"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_measures_subset.loc[gov_measures_subset['COUNTRY'] == 'Canada', "REGION"] = "North America"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_measures_subset.loc[gov_measures_subset['COUNTRY'] == 'United States', "REGION"] = "North America"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gov_me

In [67]:
# Drop duplicate records from final dataset
gov_measures_subset = gov_measures_subset.drop_duplicates(subset=None, keep = 'first')

In [68]:
gov_measures_subset

Unnamed: 0,ISO,COUNTRY,REGION,MAIN_MEASURES,STEPS_TAKEN,DATE_IMPLEMENTED
660,ARG,Argentina,South America,Public health measures,Awareness campaigns,2020-02
661,ARG,Argentina,South America,Movement restrictions,Visa restrictions,2020-03
662,ARG,Argentina,South America,Movement restrictions,International flights suspension,2020-03
663,ARG,Argentina,South America,Public health measures,Isolation and quarantine policies,2020-03
664,ARG,Argentina,South America,Social distancing,Limit public gatherings,2020-03
...,...,...,...,...,...,...
23330,USA,United States,North America,Public health measures,Other public health measures enforced,2020-11
23349,USA,United States,North America,Public health measures,Strengthening the public health system,2020-12
23351,USA,United States,North America,Public health measures,General recommendations,2020-12
23352,USA,United States,North America,Governance and socio-economic measures,Economic measures,2020-12


In [69]:
# Writing the clean dataframe to csv file
gov_measures_subset.to_csv('../cleaned_csv/cleaned_gov_measured.csv', index=False)