## **Imports and Setting Up**

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
 
# Mount Google Drive into the Colab runtime at /gdrive.
drive.mount('/gdrive')

# Switch the working directory to the Drive root ("My Drive"), so relative
# paths are resolved from there.
%cd /gdrive/'My Drive'/

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive


## **Load owid-covid-data.csv into a DataFrame**

In [21]:
# Read the raw OWID COVID-19 table; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='CSC3007_datasets/owid-covid-data.csv')

In [22]:
# Lift the column cap so wide frames are displayed without truncated columns.
pd.options.display.max_columns = None

In [23]:
# Number of columns in the raw table.
df.shape[1]

67

## **Retrieve only the columns we need from the dataframe**

In [24]:
# Narrow the table to the columns used downstream, then replace every missing
# value with 0. Note that the fill applies to all selected columns, including
# the text columns, not only the numeric ones.
df = df.loc[
    :,
    [
        "continent",
        "location",
        "date",
        "total_cases",
        "new_cases",
        "total_deaths",
        "new_deaths",
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
        "total_boosters",
        "population",
        "population_density",
    ],
].fillna(value=0)

In [42]:
# Number of rows remaining after the column selection.
df.shape[0]

197476

## **Load world_geojson.json into a DataFrame**

In [25]:
import json

# Read the GeoJSON document. JSON text is UTF-8, so the encoding is pinned
# explicitly instead of relying on the platform's locale default.
with open('CSC3007_datasets/world_geojson.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

# Flatten to one row per entry of the top-level "features" list. Feature fields
# get a "features_" prefix, and the top-level "type" value is repeated on every
# row as metadata.
df_geo_world = pd.json_normalize(
    data,
    'features',
    ['type'],
    record_prefix='features_',
)


In [26]:
# Preview the first five flattened GeoJSON features.
df_geo_world.iloc[:5]

Unnamed: 0,features_type,features_id,features_properties.name,features_geometry.type,features_geometry.coordinates,type
0,Feature,AFG,Afghanistan,Polygon,"[[[61.210817, 35.650072], [62.230651, 35.27066...",FeatureCollection
1,Feature,AGO,Angola,MultiPolygon,"[[[[16.326528, -5.87747], [16.57318, -6.622645...",FeatureCollection
2,Feature,ALB,Albania,Polygon,"[[[20.590247, 41.855404], [20.463175, 41.51508...",FeatureCollection
3,Feature,ARE,United Arab Emirates,Polygon,"[[[51.579519, 24.245497], [51.757441, 24.29407...",FeatureCollection
4,Feature,ARG,Argentina,MultiPolygon,"[[[[-65.5, -55.2], [-66.45, -55.25], [-66.9599...",FeatureCollection


In [27]:
# Names carried by the GeoJSON features; these are matched against OWID locations below.
df_geo_world.loc[:, "features_properties.name"]

0               Afghanistan
1                    Angola
2                   Albania
3      United Arab Emirates
4                 Argentina
               ...         
172               West Bank
173                   Yemen
174            South Africa
175                  Zambia
176                Zimbabwe
Name: features_properties.name, Length: 177, dtype: object

## **Keep only the rows whose "location" value (from owid-covid-data.csv) also appears in the "features_properties.name" column (from world_geojson.json)**

In [39]:
# Collapse the table to one row per location (the first row encountered for
# each), which yields the list of distinct locations present in the data.
df_without_duplicates = df.drop_duplicates(subset='location', keep='first')
print(df_without_duplicates.shape[0])

244


In [50]:
# From the one-row-per-location frame, keep only the locations whose name also
# occurs among the GeoJSON feature names (exact string match).
common_df = df_without_duplicates.loc[
    df_without_duplicates['location'].isin(df_geo_world['features_properties.name'])
]

Unnamed: 0,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,population,population_density
0,Asia,Afghanistan,2020-02-24,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,54.422
1723,Europe,Albania,2020-02-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2872934.0,104.871
2578,Africa,Algeria,2020-02-25,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,44616626.0,17.348
4282,Africa,Angola,2020-03-20,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33933611.0,23.890
6774,South America,Argentina,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45605823.0,16.177
...,...,...,...,...,...,...,...,...,...,...,...,...,...
192606,Asia,Vietnam,2020-01-23,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,98168829.0,308.127
194112,Africa,Western Sahara,2022-04-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,611872.0,0.000
195002,Asia,Yemen,2020-04-10,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,30490639.0,53.508
195812,Africa,Zambia,2020-03-18,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,18920657.0,22.995


In [52]:
# Location names present in both the OWID table and the GeoJSON features.
intersect_locations = common_df.loc[:, 'location']

0            Afghanistan
1723             Albania
2578             Algeria
4282              Angola
6774           Argentina
               ...      
192606           Vietnam
194112    Western Sahara
195002             Yemen
195812            Zambia
196645          Zimbabwe
Name: location, Length: 160, dtype: object


In [56]:
# Select every row of the full table whose location is one of the names shared
# with the GeoJSON, then report how many rows were kept.
final_output = df[df['location'].isin(intersect_locations.values)]
print(final_output.shape[0])

133581


In [55]:
# Write the filtered table as UTF-8 CSV, relative to the current working directory.
# NOTE(review): the row index is written too, as an unnamed first column; pass
# index=False if whatever consumes this file does not need it.
final_output.to_csv(
    path_or_buf="CSC3007_datasets/owid-covid-data_processed.csv",
    encoding='utf-8',
)