In [217]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [218]:
# Daily COVID-19 figures per country; the mobility trends live in a separate CSV.
filename = 'worldometer_coronavirus_daily_data.csv'
df = pd.read_csv(filepath_or_buffer=filename)
mob_df = pd.read_csv(filepath_or_buffer='mobility trends datasets/country.csv')
#https://www.kaggle.com/josephassaker/covid19-global-dataset

In [219]:
df  # display the COVID frame exactly as loaded from the CSV

Unnamed: 0,date,country,cumulative_total_cases,daily_new_cases,active_cases,cumulative_total_deaths,daily_new_deaths
0,2020-2-15,Afghanistan,0.0,,0.0,0.0,
1,2020-2-16,Afghanistan,0.0,,0.0,0.0,
2,2020-2-17,Afghanistan,0.0,,0.0,0.0,
3,2020-2-18,Afghanistan,0.0,,0.0,0.0,
4,2020-2-19,Afghanistan,0.0,,0.0,0.0,
...,...,...,...,...,...,...,...
84115,2021-2-28,Zimbabwe,36089.0,31.0,1960.0,1463.0,0.0
84116,2021-3-01,Zimbabwe,36115.0,26.0,1742.0,1468.0,5.0
84117,2021-3-02,Zimbabwe,36148.0,33.0,1687.0,1472.0,4.0
84118,2021-3-03,Zimbabwe,36179.0,31.0,1309.0,1478.0,6.0


### First we eliminate the countries that we won't use

In [220]:
# Country names that occur in the mobility data but not in the COVID data.
setmob = set(mob_df['country'].unique())
setcov = set(df['country'].unique())
setmob.difference(setcov)

{'Antigua and Barbuda',
 'Bosnia and Herzegovina',
 'Cape Verde',
 'Czechia',
 "Côte d'Ivoire",
 'Hong Kong',
 'North Macedonia',
 'Puerto Rico',
 'The Bahamas',
 'Trinidad and Tobago',
 'United Kingdom',
 'United States',
 'Vietnam'}

These are the countries that appear only in the mobility dataset. As we can see, several of them are missing simply because the two datasets spell their names differently.

In [221]:
# Translate the COVID dataset's country spellings into the spellings used by the
# mobility dataset. The mapping is applied once, to the `country` column only,
# instead of scanning every column of the frame once per name.
country_renames = {
    'USA': 'United States',
    'UK': 'United Kingdom',
    'Viet Nam': 'Vietnam',
    "Cote D Ivoire": "Côte d'Ivoire",
    'Czech Republic': 'Czechia',
    'Bahamas': 'The Bahamas',
    'China Hong Kong Sar': 'Hong Kong',
    'Trinidad And Tobago': 'Trinidad and Tobago',
}
df = df.assign(country=df['country'].replace(country_renames))

After equating the misnamed countries, these remain:

In [222]:
# Same comparison as before the renaming: mobility-only country names.
setmob = set(mob_df['country'].unique())
setcov = set(df['country'].unique())
setmob.difference(setcov)

{'Antigua and Barbuda',
 'Bosnia and Herzegovina',
 'Cape Verde',
 'North Macedonia',
 'Puerto Rico'}

In [223]:
# Keep only the countries that are present in both datasets.
country_mask_covid = df.country.isin(mob_df.country.unique())
country_mask_mobility = mob_df.country.isin(df.country.unique())
# .copy() gives an independent frame, so later `.loc` assignments on `df`
# do not trigger SettingWithCopyWarning.
df = df.loc[country_mask_covid].copy()
# drop=True discards the old index directly instead of materialising an
# 'index' column and dropping it afterwards.
mob_df = mob_df.loc[country_mask_mobility].reset_index(drop=True)

In [224]:
# Fill gaps in every numeric column by linear interpolation, one country at a
# time so values never bleed from one country into the next.
# Only numeric columns are interpolated; `date` and `country` are left alone
# (calling interpolate() on object columns is deprecated in recent pandas).
numeric_cols = list(df.select_dtypes(include='number').columns)
df = df.copy()
df[numeric_cols] = (
    df.groupby('country')[numeric_cols]
      .transform(lambda series: series.interpolate())
)

### Then we align the dates

These are the maximum and minimum dates for our mobility dataset

In [225]:
max(mob_df.date)  # dates are strings here, so this is a lexical max (equals the latest day assuming zero-padded YYYY-MM-DD)

'2021-02-23'

In [226]:
min(mob_df.date)  # dates are strings here, so this is a lexical min (equals the earliest day assuming zero-padded YYYY-MM-DD)

'2020-09-11'

In [227]:
# Parse the date strings into datetime64 values.
# The column is replaced outright: `df.loc[:, 'date'] = ...` writes into the
# existing object column on pandas >= 2.0 and leaves its dtype as object.
df = df.assign(date=pd.to_datetime(df['date']))

In [228]:
# Restrict the COVID data to the period covered by the mobility data.
# The bounds are read from mob_df instead of being hard-coded; for the data
# shown in this notebook they are 2020-09-11 and 2021-02-23.
first_day = pd.to_datetime(mob_df['date']).min()
last_day = pd.to_datetime(mob_df['date']).max()
# between() is inclusive on both ends, matching the original >= / <= filters.
df = df.loc[df['date'].between(first_day, last_day)].reset_index(drop=True)

Now we check for the dataset length to see if everything is alright

In [229]:
len(df)  # row count of the COVID frame; compared with len(mob_df) in the next cell

17098

In [230]:
len(mob_df)  # row count of the mobility frame; compared with len(df) from the previous cell

17098

### Now the NaNs

In [231]:
df.isna().sum()  # number of missing values per column

date                         0
country                      0
cumulative_total_cases       0
daily_new_cases              0
active_cases               332
cumulative_total_deaths    332
daily_new_deaths           442
dtype: int64

In [232]:
# Rows that still contain at least one missing value, and the countries they belong to.
has_missing = df.isna().any(axis='columns')
aux = df.loc[has_missing]
aux['country'].unique()

array(['Cambodia', 'Laos', 'Mongolia', 'Netherlands', 'Sweden'],
      dtype=object)

Mongolia has NaNs on daily new deaths, but cumulative deaths show their first death is after the string of NaNs. So it must be filled with 0

In [233]:
# Mongolia's missing daily_new_deaths all precede its first recorded death, so
# they are true zeros. Only that column is filled, so a gap in any other
# Mongolian column is not silently turned into 0.
is_mongolia = df['country'] == 'Mongolia'
df.loc[is_mongolia, 'daily_new_deaths'] = df.loc[is_mongolia, 'daily_new_deaths'].fillna(0)

Laos and Cambodia don't have any information on the number of deceased, while Sweden and the Netherlands don't have any information on the number of active cases. We should take those things into account when doing the regressions, but I don't think we should eliminate these countries quite yet.

### Joining the dataframes

I didn't know how to do this, so I reordered the countries and joined the dataframes. There certainly is a more efficient way to do this. 

In [247]:
# Combine mobility and COVID data on their shared (country, date) key.
# A key-based merge replaces the original row-position join: it stays correct
# if a date is missing in one dataset, and it does not need DataFrame.append
# (removed in pandas 2.0, and quadratic when called in a loop).

# Keep the original row layout: countries grouped in order of first appearance
# in mob_df, with each country's rows in their existing order.
country_rank = {name: pos for pos, name in enumerate(mob_df['country'].unique())}
auxmob = (
    mob_df
    .assign(_country_rank=mob_df['country'].map(country_rank))
    .sort_values('_country_rank', kind='stable')
    .drop(columns='_country_rank')
    .reset_index(drop=True)
)

# mob_df holds its dates as 'YYYY-MM-DD' strings (e.g. '2020-09-11'), so the
# COVID dates are rendered in the same form to make the keys comparable.
# NOTE(review): assumes every mobility date is zero-padded ISO — confirm.
auxdf = df.assign(date=pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d'))

# how='left' keeps every mobility row, as the original join did;
# validate raises if a (country, date) pair is duplicated on either side.
df = auxmob.merge(auxdf, on=['country', 'date'], how='left', validate='one_to_one')

In [248]:
df  # display the combined mobility + COVID frame

Unnamed: 0,country,date,recreation,grocery and pharmacy,parks,transit,workplaces,residential,cumulative_total_cases,daily_new_cases,active_cases,cumulative_total_deaths,daily_new_deaths
0,United Arab Emirates,2020-09-11,-27.0,-8.0,-48.0,-45.0,-16.0,9.0,77842.0,931.0,8982.0,398.0,0.0
1,United Arab Emirates,2020-09-12,-22.0,-3.0,-39.0,-42.0,-13.0,7.0,78849.0,1007.0,9467.0,399.0,1.0
2,United Arab Emirates,2020-09-13,-19.0,0.0,-32.0,-39.0,-20.0,9.0,79489.0,640.0,9639.0,399.0,0.0
3,United Arab Emirates,2020-09-14,-22.0,-4.0,-36.0,-40.0,-19.0,10.0,80266.0,777.0,9886.0,399.0,0.0
4,United Arab Emirates,2020-09-15,-22.0,-4.0,-36.0,-40.0,-19.0,10.0,80940.0,674.0,9904.0,401.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17093,Zimbabwe,2021-02-19,-20.0,6.0,-15.0,-29.0,-23.0,11.0,35710.0,167.0,2308.0,1430.0,10.0
17094,Zimbabwe,2021-02-20,-21.0,10.0,-4.0,-30.0,-14.0,6.0,35768.0,58.0,2240.0,1432.0,2.0
17095,Zimbabwe,2021-02-21,-20.0,6.0,2.0,-33.0,-13.0,9.0,35796.0,28.0,2235.0,1436.0,4.0
17096,Zimbabwe,2021-02-22,-36.0,-15.0,-17.0,-43.0,-66.0,21.0,35862.0,66.0,2205.0,1441.0,5.0


In [249]:
# Write the combined table to disk, leaving out the positional index.
df.to_csv(path_or_buf=r'finished_datasets/country.csv', index=False)