In [3]:
from io import BytesIO
import requests
import pandas as pd

### Fetch & Load the dataset

In [5]:
url = "https://catalog.ourworldindata.org/garden/covid/latest/compact/compact.csv"

# GET request for the dataset URL.
# requests applies no timeout by default, so a stalled connection would block
# this cell until the kernel is interrupted. The tuple is
# (connect timeout, read timeout) in seconds; the read timeout bounds the wait
# between received bytes, not the total download time.
response = requests.get(url, timeout=(10, 60))
# Raise on HTTP 4xx/5xx so an error page is never parsed as CSV.
response.raise_for_status()

# Load the downloaded bytes into a pandas DataFrame
csv_data = BytesIO(response.content)
covid19_data = pd.read_csv(csv_data)
covid19_data.head()

KeyboardInterrupt: 

### Basic information about the dataset

In [None]:
# Total in-memory footprint of the DataFrame, reported in megabytes.
# NOTE: memory_usage() is called with its default deep=False, so the string
# contents of object-dtype columns are not included in this figure.
total_bytes = covid19_data.memory_usage().sum()
print(f"Dataset's memory size: {total_bytes/1e6:.1f} MB")

Dataset's memory size: 223.8 MB


In [None]:
# Report the DataFrame's dimensions: shape is (rows, columns),
# size is the total number of cells (rows * columns).
n_rows, n_cols = covid19_data.shape
print(f"Number of Columns: {n_cols}")
print(f"Number of Rows(observations): {n_rows}")

n_cells = covid19_data.size
print(f"Size: {n_cells}")

Number of Columns: 61
Number of Rows(observations): 458548
Size: 27971428


In [None]:
# Data type of each feature (column)
column_dtypes = covid19_data.dtypes
print(column_dtypes)

country                        object
date                           object
total_cases                   float64
new_cases                     float64
new_cases_smoothed            float64
                               ...   
extreme_poverty               float64
diabetes_prevalence           float64
handwashing_facilities        float64
hospital_beds_per_thousand    float64
human_development_index       float64
Length: 61, dtype: object


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
covid19_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458548 entries, 0 to 458547
Data columns (total 61 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   country                                     458548 non-null  object 
 1   date                                        458548 non-null  object 
 2   total_cases                                 446750 non-null  float64
 3   new_cases                                   444606 non-null  float64
 4   new_cases_smoothed                          443386 non-null  float64
 5   total_cases_per_million                     446750 non-null  float64
 6   new_cases_per_million                       444606 non-null  float64
 7   new_cases_smoothed_per_million              443386 non-null  float64
 8   total_deaths                                446750 non-null  float64
 9   new_deaths                                  445200 non-null  float64
 

We see that there are 57 numerical columns and 4 categorical ones.

### Dates

In [None]:
# Parse the 'date' column (loaded as strings) into pandas datetimes
# and store the result back under the same column name.
parsed_dates = pd.to_datetime(covid19_data['date'])
covid19_data['date'] = parsed_dates

In [None]:
# Earliest and latest dates covered by the dataset
covid19_data['date'].agg(['min', 'max'])

min   2020-01-01
max   2024-12-31
Name: date, dtype: datetime64[ns]

We see that this dataset contains COVID-19 data from 2020-01-01 to 2024-12-31.<br>
Five years.

### Relevant Features
This dataset has many features (61 columns); we will select only the following for analysis:

In [None]:
# Columns kept for the analysis, grouped by theme
cols = [
    # identifiers
    'country', 'date',
    # cases and deaths
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    'total_cases_per_million', 'total_deaths_per_million',
    # vaccinations
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
    'new_vaccinations',
    # testing
    'total_tests', 'new_tests', 'positive_rate',
    # country-level context
    'population', 'median_age', 'gdp_per_capita',
]
# Restrict the DataFrame to the selected columns (same order as listed above)
covid19_data = covid19_data[cols]
covid19_data.tail()

Unnamed: 0,country,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,total_deaths_per_million,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,total_tests,new_tests,positive_rate,population,median_age,gdp_per_capita
458543,Zimbabwe,2024-11-20,266396.0,0.0,5740.0,0.0,16578.193,357.2082,,,,,,,,16069061.0,17.683,2207.957
458544,Zimbabwe,2024-11-21,266396.0,0.0,5740.0,0.0,16578.193,357.2082,,,,,,,,16069061.0,17.683,2207.957
458545,Zimbabwe,2024-11-22,266396.0,0.0,5740.0,0.0,16578.193,357.2082,,,,,,,,16069061.0,17.683,2207.957
458546,Zimbabwe,2024-11-23,266396.0,0.0,5740.0,0.0,16578.193,357.2082,,,,,,,,16069061.0,17.683,2207.957
458547,Zimbabwe,2024-11-24,266396.0,0.0,5740.0,0.0,16578.193,357.2082,,,,,,,,16069061.0,17.683,2207.957


### Sampling
This dataset is large, with nearly half a million observations, which is too much for EDA. To reduce the dataset size and focus our Exploratory Data Analysis,<br>
we will select a few countries and focus on them.

In [None]:
# List the distinct values of the 'country' column
covid19_data['country'].unique()

array(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'American Samoa',
       'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda',
       'Argentina', 'Armenia', 'Aruba', 'Asia', 'Asia excl. China',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Cayman Islands', 'Central African Republic', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Curacao',
       'Cyprus', 'Czechia', 'Democratic Republic of Congo', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'East Timor',
       'Ecuador', 'Egypt', 'El Salvador', 'England

We will continue with **Algeria**, **Saudi Arabia**, **Russia**, **Greenland**, **South Africa** and **Argentina**.

In [None]:
# Countries retained for the focused analysis
selected_countries = [
    'Algeria',
    "Saudi Arabia",
    "Russia",
    "Greenland",
    "South Africa",
    "Argentina",
]
# Boolean mask: True for rows whose country is one of the selected ones
is_selected = covid19_data['country'].isin(selected_countries)
subset_data = covid19_data[is_selected]

### Upload the Raw Data to the API

In [None]:
# Serialize the DataFrame to JSON for transmission.
# Called without a path, to_json() returns the JSON text as a str.
subset_data_json = subset_data.to_json()

In [None]:
# Endpoint of the locally running API that stores the raw data
raw_data_url = "http://127.0.0.1:5000/raw_data"
# requests applies no timeout by default; without one, an unresponsive server
# would block this cell indefinitely. (connect timeout, read timeout) in seconds.
# NOTE(review): subset_data_json appears to already be a JSON string (output of
# DataFrame.to_json()), so passing it via json= serializes it a second time and
# the server receives a JSON string literal - confirm this is what the API expects.
response = requests.post(raw_data_url, json=subset_data_json, timeout=(10, 60))
# Show the HTTP status code returned by the API
print(response.status_code)

201
