# Imports

In [41]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np

In [49]:
from sklearn.cluster import KMeans

# Data Reading

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv')


Columns (2,3) have mixed types.Specify dtype option on import or set low_memory=False.



In [3]:
df.head()

Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,...,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,Aruba,ABW,,,20200101,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aruba,ABW,,,20200102,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aruba,ABW,,,20200103,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aruba,ABW,,,20200104,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Aruba,ABW,,,20200105,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Engineering

In [129]:
cont_rename = dict(df[['CountryCode', 'CountryName']].drop_duplicates().values)

## a. Getting Daily Case Count per Country

In [23]:
cont_df = df[df['RegionName'].isna()]

In [24]:
cont_df = cont_df.pivot(index='Date', columns='CountryCode', values='ConfirmedCases').fillna(method='ffill').fillna(0)

In [25]:
cont_df

CountryCode,ABW,AFG,AGO,AIA,ALB,AND,ARE,ARG,AUS,AUT,...,USA,UZB,VEN,VGB,VNM,VUT,YEM,ZAF,ZMB,ZWE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20200102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20200103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20200104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20200105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200923,3665.0,39145.0,4236.0,3.0,12666.0,1681.0,86447.0,652161.0,26942.0,39897.0,...,6896274.0,53051.0,68453.0,71.0,1068.0,0.0,2032.0,663282.0,14389.0,7711.0
20200924,3721.0,39170.0,4236.0,3.0,12787.0,1753.0,87530.0,664799.0,26973.0,40558.0,...,6934204.0,53667.0,69439.0,71.0,1069.0,0.0,2032.0,665188.0,14443.0,7725.0
20200925,3756.0,39186.0,4363.0,3.0,12921.0,1753.0,88532.0,678266.0,26983.0,41246.0,...,6978417.0,53966.0,70406.0,71.0,1069.0,0.0,2033.0,667049.0,14491.0,7752.0
20200926,3799.0,39192.0,4475.0,3.0,13045.0,1836.0,89540.0,691222.0,27000.0,42317.0,...,7033430.0,54462.0,71273.0,71.0,1069.0,0.0,2033.0,668529.0,14515.0,7787.0


In [26]:
cont_df.index = pd.to_datetime(cont_df.index, format='%Y%m%d')

## Calculating Infection Rate

In [27]:
infect_rate = cont_df.diff().dropna()

# Map Visualization at August 1

In [37]:
fig = px.choropleth(cont_df.loc['2020-08-01'].to_frame('Case Count').reset_index(),
                   locations='CountryCode',
                   color='Case Count')

In [38]:
fig

# Log Map Visualization at August 1

In [44]:
fig = px.choropleth(cont_df.loc['2020-08-01'].apply(lambda x: np.log10(x)).to_frame('Case Count (log)').reset_index(),
                   locations='CountryCode',
                   color='Case Count (log)')

In [45]:
fig

# Infection Rate Map ad August 1

In [46]:
fig = px.choropleth(infect_rate.loc['2020-08-01'].to_frame('Infection Rate').reset_index(),
                   locations='CountryCode',
                   color='Infection Rate')

In [47]:
fig

# Clustering all the countries

In [50]:
kmeans = KMeans()

In [51]:
kmeans.fit(cont_df.T)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [83]:
sort = pd.DataFrame(kmeans.cluster_centers_).max(axis=1).sort_values(ascending=False).index

In [84]:
sort_dict = {sort[i]:f"Cluster {i+1}" for i in range(len(sort))}

In [85]:
preds = kmeans.predict(cont_df.T)

In [86]:
ser = pd.Series(preds, index=cont_df.columns)

In [87]:
vals = {}
for i, j in ser.items():
    vals.setdefault(sort_dict[j], [])
    vals[sort_dict[j]].append(i)

vals = {i:pd.Series(j) for i,j in vals.items()}

# Cluster Members

In [89]:
pd.DataFrame(vals)[[f"Cluster {i+1}" for i in range(len(sort))]]

Unnamed: 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,Cluster 8
0,USA,IND,BRA,RUS,COL,ARG,ARE,ABW
1,,,,,MEX,BGD,BEL,AFG
2,,,,,PER,CHL,BLR,AGO
3,,,,,ZAF,DEU,BOL,AIA
4,,,,,,ESP,CAN,ALB
...,...,...,...,...,...,...,...,...
133,,,,,,,,VNM
134,,,,,,,,VUT
135,,,,,,,,YEM
136,,,,,,,,ZMB


# Cluster Averages

In [96]:
averages = pd.DataFrame(kmeans.cluster_centers_.T, index=cont_df.index)

In [97]:
averages = averages.rename(columns = {i:sort_dict[i] for i in averages})[[f"Cluster {i+1}" for i in range(len(sort))]]

In [106]:
averages = averages.stack(level=0).reset_index().rename(columns={"level_1":"Cluster", 0:"Count"})

In [108]:
px.line(averages, x = 'Date', y='Count', color='Cluster')

# Uk, Sweden, Georgia, South Korea

## UK

In [120]:
sort_dict[ser.loc['GBR']]

'Cluster 6'

### UK is in the 6th smallest cluster in the world with the following countries

In [131]:
print(' - ' + '\n - '.join(vals['Cluster 6'].apply(lambda x: cont_rename[x])))

 - Argentina
 - Bangladesh
 - Chile
 - Germany
 - Spain
 - France
 - United Kingdom
 - Iran
 - Italy
 - Pakistan
 - Saudi Arabia
 - Turkey


# Sweden

In [132]:
sort_dict[ser.loc['SWE']]

'Cluster 7'

### Sweden is in the 7th smallest cluster in the world with the following countries

In [134]:
print(' - ' + '\n - '.join(vals['Cluster 7'].apply(lambda x: cont_rename[x])))

 - United Arab Emirates
 - Belgium
 - Belarus
 - Bolivia
 - Canada
 - China
 - Dominican Republic
 - Ecuador
 - Egypt
 - Guatemala
 - Indonesia
 - Iraq
 - Israel
 - Japan
 - Kazakhstan
 - Kuwait
 - Netherlands
 - Oman
 - Panama
 - Philippines
 - Poland
 - Portugal
 - Qatar
 - Romania
 - Singapore
 - Sweden
 - Ukraine


# Georgia & South Korea

In [142]:
sort_dict[ser.loc['GEO']], sort_dict[ser.loc['KOR']]

('Cluster 8', 'Cluster 8')

### Georgia and South Korea are in the 8th smallest cluster in the world with the following countries

In [143]:
print(' - ' + '\n - '.join(vals['Cluster 8'].apply(lambda x: cont_rename[x])))

 - Aruba
 - Afghanistan
 - Angola
 - Anguilla
 - Albania
 - Andorra
 - Australia
 - Austria
 - Azerbaijan
 - Burundi
 - Benin
 - Burkina Faso
 - Bulgaria
 - Bahrain
 - Bosnia and Herzegovina
 - Belize
 - Bermuda
 - Barbados
 - Brunei
 - Bhutan
 - Botswana
 - Central African Republic
 - Switzerland
 - Cote d'Ivoire
 - Cameroon
 - Democratic Republic of Congo
 - Congo
 - Cape Verde
 - Costa Rica
 - Cuba
 - Cayman Islands
 - Cyprus
 - Czech Republic
 - Djibouti
 - Dominica
 - Denmark
 - Algeria
 - Eritrea
 - Estonia
 - Ethiopia
 - Finland
 - Fiji
 - Falkland Islands
 - Gabon
 - Georgia
 - Ghana
 - Gibraltar
 - Guinea
 - Gambia
 - Greece
 - Greenland
 - Guam
 - Guyana
 - Hong Kong
 - Honduras
 - Croatia
 - Haiti
 - Hungary
 - Ireland
 - Iceland
 - Jamaica
 - Jordan
 - Kenya
 - Kyrgyz Republic
 - Cambodia
 - South Korea
 - Laos
 - Lebanon
 - Liberia
 - Libya
 - Sri Lanka
 - Lesotho
 - Lithuania
 - Luxembourg
 - Latvia
 - Macao
 - Morocco
 - Moldova
 - Madagascar
 - Mali
 - Myanmar
 - Mongol